-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
clang/AMDGPU: Emit atomicrmw from ds_fadd builtins #95395
clang/AMDGPU: Emit atomicrmw from ds_fadd builtins #95395
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesWe should have done this for the f32/f64 case a long time ago. Now that This also does upgrade the behavior to respect a volatile qualified pointer, Patch is 20.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95395.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 511e1fd4016d7..d81cf40c912de 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
break;
}
+ // Some of the atomic builtins take the scope as a string name.
StringRef scp;
- llvm::getConstantStringInfo(Scope, scp);
- SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+ if (llvm::getConstantStringInfo(Scope, scp)) {
+ SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
+ return;
+ }
+
+ // Older builtins had an enum argument for the memory scope.
+ int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
+ switch (scope) {
+ case 0: // __MEMORY_SCOPE_SYSTEM
+ SSID = llvm::SyncScope::System;
+ break;
+ case 1: // __MEMORY_SCOPE_DEVICE
+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+ break;
+ case 2: // __MEMORY_SCOPE_WRKGRP
+ SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+ break;
+ case 3: // __MEMORY_SCOPE_WVFRNT
+ SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+ break;
+ case 4: // __MEMORY_SCOPE_SINGLE
+ SSID = llvm::SyncScope::SingleThread;
+ break;
+ default:
+ SSID = llvm::SyncScope::System;
+ break;
+ }
}
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
@@ -18558,14 +18584,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
- case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
Intrinsic::ID Intrin;
switch (BuiltinID) {
- case AMDGPU::BI__builtin_amdgcn_ds_faddf:
- Intrin = Intrinsic::amdgcn_ds_fadd;
- break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
Intrin = Intrinsic::amdgcn_ds_fmin;
break;
@@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
return Builder.CreateCall(F, {Addr, Val});
}
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
- Intrinsic::ID IID;
- llvm::Type *ArgTy;
- switch (BuiltinID) {
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
- ArgTy = llvm::Type::getFloatTy(getLLVMContext());
- IID = Intrinsic::amdgcn_ds_fadd;
- break;
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
- ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
- IID = Intrinsic::amdgcn_ds_fadd;
- break;
- case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 2);
- IID = Intrinsic::amdgcn_ds_fadd;
- break;
- }
- llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Value *Val = EmitScalarExpr(E->getArg(1));
- llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
- llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
- llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
- llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
- llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
- return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
- }
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19044,7 +19037,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
- case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
+ case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
+ case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19055,23 +19053,54 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
BinOp = llvm::AtomicRMWInst::UDecWrap;
break;
+ case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+ BinOp = llvm::AtomicRMWInst::FAdd;
+ break;
}
Address Ptr = CheckAtomicAlignment(*this, E);
Value *Val = EmitScalarExpr(E->getArg(1));
+ llvm::Type *OrigTy = Val->getType();
+ QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
- ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
- EmitScalarExpr(E->getArg(3)), AO, SSID);
+ bool Volatile;
- QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
- bool Volatile =
- PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
+ if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
+ // __builtin_amdgcn_ds_faddf has an explicit volatile argument
+ Volatile =
+ cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
+ } else {
+ // Infer volatile from the passed type.
+ Volatile =
+ PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
+ }
+
+ if (E->getNumArgs() >= 4) {
+ // Some of the builtins have explicit ordering and scope arguments.
+ ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
+ EmitScalarExpr(E->getArg(3)), AO, SSID);
+ } else {
+ // The ds_fadd_* builtins do not have syncscope/order arguments.
+ SSID = llvm::SyncScope::System;
+ AO = AtomicOrdering::SequentiallyConsistent;
+
+ // The v2bf16 builtin uses i16 instead of a natural bfloat type.
+ if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
+ llvm::Type *V2BF16Ty = FixedVectorType::get(
+ llvm::Type::getBFloatTy(Builder.getContext()), 2);
+ Val = Builder.CreateBitCast(Val, V2BF16Ty);
+ }
+ }
llvm::AtomicRMWInst *RMW =
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
if (Volatile)
RMW->setVolatile(true);
- return RMW;
+ return Builder.CreateBitCast(RMW, OrigTy);
}
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 1fc2fb99260f4..132cbd27b08fc 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -115,7 +115,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 8dbb8c538ddc1..7bb756a4a2731 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -112,7 +112,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
index 66ec200a8e6d4..03b39cd1291c7 100644
--- a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
+++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
@@ -11,9 +11,10 @@ typedef __attribute__((address_space(3))) float *LP;
// CHECK: store ptr %addr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
// CHECK: %[[ADDR_ADDR_ASCAST:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
// CHECK: %[[AS_CAST:.*]] = addrspacecast ptr %[[ADDR_ADDR_ASCAST]] to ptr addrspace(3)
-// CHECK: %3 = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[AS_CAST]]
+// CHECK: [[TMP2:%.+]] = load float, ptr %val.addr.ascast, align 4
+// CHECK: [[TMP3:%.+]] = atomicrmw fadd ptr addrspace(3) %[[AS_CAST]], float [[TMP2]] monotonic, align 4
// CHECK: %4 = load ptr, ptr %rtn.ascast, align 8
-// CHECK: store float %3, ptr %4, align 4
+// CHECK: store float [[TMP3]], ptr %4, align 4
__device__ void test_ds_atomic_add_f32(float *addr, float val) {
float *rtn;
*rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);
diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
index 1ea1d5f454762..e01d9d7efed27 100644
--- a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
+++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
@@ -20,7 +20,7 @@ typedef __attribute__((address_space(3))) float *LP;
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
// CHECK-NEXT: ret void
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index ea2aedf8d44a5..3fe1c11828dd6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,13 +117,44 @@ void test_update_dpp(global int* out, int arg1, int arg2)
}
// CHECK-LABEL: @test_ds_fadd
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_faddf(local float *out, float src) {
#else
-void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
+ void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
#endif
+
*out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, true);
+
+ // Test all orders.
+ *out = __builtin_amdgcn_ds_faddf(out, src, 1, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 2, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 3, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 4, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 5, 0, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 5, 0, false); // invalid
+
+ // Test all syncscopes.
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 1, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 2, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 3, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 4, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, 0, 5, false); // invalid
}
// CHECK-LABEL: @test_ds_fmin
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 0b4038a2adc55..63381942eaba5 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -10,7 +10,10 @@ typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;
// CHECK-LABEL: test_local_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
// GFX12-LABEL: test_local_add_2bf16
// GFX12: ds_pk_add_rtn_bf16
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
@@ -18,7 +21,10 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2bf16_noret
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
// GFX12-LABEL: test_local_add_2bf16_noret
// GFX12: ds_pk_add_bf16
void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
@@ -26,7 +32,7 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX12-LABEL: test_local_add_2f16
// GFX12: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -34,7 +40,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX12-LABEL: test_local_add_2f16_noret
// GFX12: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index 823316da20df7..ad4d0b7af3d4b 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
// REQUIRES: amdgpu-registered-target
// CHECK-LABEL: test_fadd_local
-// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// GFX8-LABEL: test_fadd_local$local:
// GFX8: ds_add_rtn_f32 v2, v0, v1
// GFX8: s_endpgm
@@ -14,3 +14,10 @@ kernel void test_fadd_local(__local float *ptr, float val){
float *res;
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
}
+
+// CHECK-LABEL: test_fadd_local_volatile
+// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
+ volatile float *res;
+ *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 8e816509341d4..e2117f11858f7 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
}
// CHECK-LABEL: test_ds_add_local_f64
-// CHECK: call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %{{.*}}, double %{{.*}},
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
// GFX90A: test_ds_add_local_f64$local
// GFX90A: ds_add_rtn_f64
void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
}
// CHECK-LABEL: test_ds_addf_local_f32
-// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}},
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// GFX90A-LABEL: test_ds_addf_local_f32$local
// GFX90A: ds_add_rtn_f32
void test_ds_addf_local_f32(__local float *addr, float x){
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index e415a95eadd27..92a33ceac2290 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -42,7 +42,11 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2bf16
-// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+
+// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
+
// GFX940-LABEL: test_local_add_2bf16
// GFX940: ds_pk_add_rtn_bf16
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
@@ -50,7 +54,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2f16
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX940-LABEL: test_local_add_2f16
// GFX940: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -58,7 +62,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX940-LABEL: test_local_add_2f16_noret
// GFX940: ds_pk_add_f16
void test_local_add_2f16_noret(__local half...
[truncated]
|
8eb6914
to
c516231
Compare
e176918
to
b6fa394
Compare
c516231
to
a307ee0
Compare
b6fa394
to
0bfa259
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks fairly straightforward with those prerequisites.
a307ee0
to
25ea24e
Compare
We should have done this for the f32/f64 case a long time ago. Now that codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting it instead. This also does upgrade the behavior to respect a volatile qualified pointer, which was previously ignored (for the cases that don't have an explicit volatile argument).
0bfa259
to
35c741f
Compare
These builtins generate atomic instructions in IR but the builtin function name does not have atomic. Is that a concern? Should they be renamed with atomic in name? |
We should have done this for the f32/f64 case a long time ago. Now that codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting it instead. This also does upgrade the behavior to respect a volatile qualified pointer, which was previously ignored (for the cases that don't have an explicit volatile argument).
We should have done this for the f32/f64 case a long time ago. Now that
codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting
it instead.
This also does upgrade the behavior to respect a volatile qualified pointer,
which was previously ignored (for the cases that don't have an explicit
volatile argument).